/*
 * Copyright (C) 2024 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <byteswap.h>
#include <ipxe/netdevice.h>
#include <ipxe/ethernet.h>
#include <ipxe/if_ether.h>
#include <ipxe/iobuf.h>
#include <ipxe/dma.h>
#include <ipxe/pci.h>
#include <ipxe/fault.h>
#include "gve.h"

/** @file
 *
 * Google Virtual Ethernet network driver
 *
 */

/* Disambiguate the various error causes
 *
 * Each admin queue failure cause is given its own variant of EIO,
 * with a distinct uniqifier and description: 0x00 for "Uncompleted"
 * and 0x10 to 0x1f for the remaining causes.  EIO_ADMIN() selects
 * among these variants using the low five bits of the admin queue
 * command status.
 */
#define EINFO_EIO_ADMIN_UNSET						\
	__einfo_uniqify ( EINFO_EIO, 0x00, "Uncompleted" )
#define EIO_ADMIN_UNSET							\
	__einfo_error ( EINFO_EIO_ADMIN_UNSET )
#define EINFO_EIO_ADMIN_ABORTED						\
	__einfo_uniqify ( EINFO_EIO, 0x10, "Aborted" )
#define EIO_ADMIN_ABORTED						\
	__einfo_error ( EINFO_EIO_ADMIN_ABORTED )
#define EINFO_EIO_ADMIN_EXISTS						\
	__einfo_uniqify ( EINFO_EIO, 0x11, "Already exists" )
#define EIO_ADMIN_EXISTS						\
	__einfo_error ( EINFO_EIO_ADMIN_EXISTS )
#define EINFO_EIO_ADMIN_CANCELLED					\
	__einfo_uniqify ( EINFO_EIO, 0x12, "Cancelled" )
#define EIO_ADMIN_CANCELLED						\
	__einfo_error ( EINFO_EIO_ADMIN_CANCELLED )
#define EINFO_EIO_ADMIN_DATALOSS					\
	__einfo_uniqify ( EINFO_EIO, 0x13, "Data loss" )
#define EIO_ADMIN_DATALOSS						\
	__einfo_error ( EINFO_EIO_ADMIN_DATALOSS )
#define EINFO_EIO_ADMIN_DEADLINE					\
	__einfo_uniqify ( EINFO_EIO, 0x14, "Deadline exceeded" )
#define EIO_ADMIN_DEADLINE						\
	__einfo_error ( EINFO_EIO_ADMIN_DEADLINE )
#define EINFO_EIO_ADMIN_PRECONDITION					\
	__einfo_uniqify ( EINFO_EIO, 0x15, "Failed precondition" )
#define EIO_ADMIN_PRECONDITION						\
	__einfo_error ( EINFO_EIO_ADMIN_PRECONDITION )
#define EINFO_EIO_ADMIN_INTERNAL					\
	__einfo_uniqify ( EINFO_EIO, 0x16, "Internal error" )
#define EIO_ADMIN_INTERNAL						\
	__einfo_error ( EINFO_EIO_ADMIN_INTERNAL )
#define EINFO_EIO_ADMIN_INVAL						\
	__einfo_uniqify ( EINFO_EIO, 0x17, "Invalid argument" )
#define EIO_ADMIN_INVAL							\
	__einfo_error ( EINFO_EIO_ADMIN_INVAL )
#define EINFO_EIO_ADMIN_NOT_FOUND					\
	__einfo_uniqify ( EINFO_EIO, 0x18, "Not found" )
#define EIO_ADMIN_NOT_FOUND						\
	__einfo_error ( EINFO_EIO_ADMIN_NOT_FOUND )
#define EINFO_EIO_ADMIN_RANGE						\
	__einfo_uniqify ( EINFO_EIO, 0x19, "Out of range" )
#define EIO_ADMIN_RANGE							\
	__einfo_error ( EINFO_EIO_ADMIN_RANGE )
#define EINFO_EIO_ADMIN_PERM						\
	__einfo_uniqify ( EINFO_EIO, 0x1a, "Permission denied" )
#define EIO_ADMIN_PERM							\
	__einfo_error ( EINFO_EIO_ADMIN_PERM )
#define EINFO_EIO_ADMIN_UNAUTH						\
	__einfo_uniqify ( EINFO_EIO, 0x1b, "Unauthenticated" )
#define EIO_ADMIN_UNAUTH						\
	__einfo_error ( EINFO_EIO_ADMIN_UNAUTH )
#define EINFO_EIO_ADMIN_RESOURCE					\
	__einfo_uniqify ( EINFO_EIO, 0x1c, "Resource exhausted" )
#define EIO_ADMIN_RESOURCE						\
	__einfo_error ( EINFO_EIO_ADMIN_RESOURCE )
#define EINFO_EIO_ADMIN_UNAVAIL						\
	__einfo_uniqify ( EINFO_EIO, 0x1d, "Unavailable" )
#define EIO_ADMIN_UNAVAIL						\
	__einfo_error ( EINFO_EIO_ADMIN_UNAVAIL )
#define EINFO_EIO_ADMIN_NOTSUP						\
	__einfo_uniqify ( EINFO_EIO, 0x1e, "Unimplemented" )
#define EIO_ADMIN_NOTSUP	       					\
	__einfo_error ( EINFO_EIO_ADMIN_NOTSUP )
#define EINFO_EIO_ADMIN_UNKNOWN						\
	__einfo_uniqify ( EINFO_EIO, 0x1f, "Unknown error" )
#define EIO_ADMIN_UNKNOWN						\
	__einfo_error ( EINFO_EIO_ADMIN_UNKNOWN )

/* Map an admin queue command status to the matching EIO variant.
 * Only the low five bits of the status are used for the lookup.
 */
#define EIO_ADMIN( status )						\
	EUNIQ ( EINFO_EIO, ( (status) & 0x1f ),				\
		EIO_ADMIN_UNSET, EIO_ADMIN_ABORTED, EIO_ADMIN_EXISTS,	\
		EIO_ADMIN_CANCELLED, EIO_ADMIN_DATALOSS,		\
		EIO_ADMIN_DEADLINE, EIO_ADMIN_PRECONDITION,		\
		EIO_ADMIN_INTERNAL, EIO_ADMIN_INVAL,			\
		EIO_ADMIN_NOT_FOUND, EIO_ADMIN_RANGE, EIO_ADMIN_PERM,	\
		EIO_ADMIN_UNAUTH, EIO_ADMIN_RESOURCE,			\
		EIO_ADMIN_UNAVAIL, EIO_ADMIN_NOTSUP, EIO_ADMIN_UNKNOWN )

/******************************************************************************
 *
 * Buffer layout
 *
 ******************************************************************************
 */

/**
 * Get buffer offset (within queue page list allocation)
 *
 * @v queue		Descriptor queue
 * @v tag		Buffer tag
 * @ret offset		Buffer offset within queue page list allocation
 */
static inline __attribute__ (( always_inline)) size_t
gve_offset ( struct gve_queue *queue, unsigned int tag ) {
	size_t offset;

	/* Enough pages are allocated to cover the maximum fill level
	 * of buffers, and buffers are recycled in strict rotation as
	 * the hardware releases them.  A valid tag is therefore
	 * always below the fill level.
	 */
	assert ( tag < queue->fill );

	/* Buffers are laid out back to back */
	offset = ( tag * GVE_BUF_SIZE );
	return offset;
}

/**
 * Get buffer address (within queue page list address space)
 *
 * @v queue		Descriptor queue
 * @v tag		Buffer tag
 * @ret addr		Buffer address within queue page list address space
 */
static inline __attribute__ (( always_inline)) physaddr_t
gve_address ( struct gve_queue *queue, unsigned int tag ) {
	size_t offset = gve_offset ( queue, tag );

	/* The pages form one contiguous block, so the address is
	 * simply the recorded base plus the buffer offset.
	 */
	return ( queue->qpl.base + offset );
}

/**
 * Get buffer address
 *
 * @v queue		Descriptor queue
 * @v tag		Buffer tag
 * @ret addr		Buffer address
 */
static inline __attribute__ (( always_inline )) void *
gve_buffer ( struct gve_queue *queue, unsigned int tag ) {
	size_t offset = gve_offset ( queue, tag );

	/* The pages form one contiguous block, so the buffer lives at
	 * the start of the page data plus the buffer offset.
	 */
	return ( queue->qpl.data + offset );
}

/******************************************************************************
 *
 * Device reset
 *
 ******************************************************************************
 */

/**
 * Reset hardware
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * A reset is requested by clearing the admin queue page frame number
 * register, and is considered complete once that register reads back
 * as zero.  Returns zero on success (or if the reset was skipped),
 * or -ETIMEDOUT if the register has not cleared after
 * GVE_RESET_MAX_WAIT_MS polls at one millisecond intervals.
 */
static int gve_reset ( struct gve_nic *gve ) {
	uint32_t pfn;
	unsigned int i;

	/* Skip reset if admin queue page frame number is already
	 * clear.  Triggering a reset on an already-reset device seems
	 * to cause a delayed reset to be scheduled.  This can cause
	 * the device to end up in a reset loop, where each attempt to
	 * recover from reset triggers another reset a few seconds
	 * later.
	 */
	pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
	if ( ! pfn ) {
		DBGC ( gve, "GVE %p skipping reset\n", gve );
		return 0;
	}

	/* Clear admin queue page frame number */
	writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN );
	wmb();

	/* Wait for device to reset */
	for ( i = 0 ; i < GVE_RESET_MAX_WAIT_MS ; i++ ) {

		/* Delay */
		mdelay ( 1 );

		/* Check for reset completion */
		pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
		if ( ! pfn )
			return 0;
	}

	/* Report the last observed register values (byte-swapped for
	 * display) to aid diagnosis of the timeout.
	 */
	DBGC ( gve, "GVE %p reset timed out (PFN %#08x devstat %#08x)\n",
	       gve, bswap_32 ( pfn ),
	       bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) );
	return -ETIMEDOUT;
}

/******************************************************************************
 *
 * Admin queue
 *
 ******************************************************************************
 */

/**
 * Get operating mode name (for debugging)
 *
 * @v mode		Operating mode
 * @ret name		Mode name
 *
 * The name is built in a static buffer, so the returned string is
 * overwritten by the next call.
 */
static inline const char * gve_mode_name ( unsigned int mode ) {
	static char buf[ 8 /* "XXX-XXX" + NUL */ ];
	const char *queues;
	const char *addressing;

	/* Queue format */
	if ( mode & GVE_MODE_DQO ) {
		queues = "DQO";
	} else {
		queues = "GQI";
	}

	/* Addressing scheme */
	if ( mode & GVE_MODE_QPL ) {
		addressing = "QPL";
	} else {
		addressing = "RDA";
	}

	/* Combine into a single name */
	snprintf ( buf, sizeof ( buf ), "%s-%s", queues, addressing );
	return buf;
}

/**
 * Allocate admin queue
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * Allocates both the admin queue command ring and the scratch buffer.
 * On failure, nothing remains allocated.
 */
static int gve_admin_alloc ( struct gve_nic *gve ) {
	struct gve_scratch *scratch = &gve->scratch;
	struct gve_admin *admin = &gve->admin;
	size_t scratch_len = sizeof ( *scratch->buf );
	size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );

	/* Allocate admin queue */
	admin->cmd = dma_alloc ( gve->dma, &admin->map, admin_len,
				 GVE_ALIGN );
	if ( admin->cmd == NULL )
		return -ENOMEM;

	/* Allocate scratch buffer, releasing the admin queue if this
	 * second allocation fails.
	 */
	scratch->buf = dma_alloc ( gve->dma, &scratch->map, scratch_len,
				   GVE_ALIGN );
	if ( scratch->buf == NULL ) {
		dma_free ( &admin->map, admin->cmd, admin_len );
		return -ENOMEM;
	}

	DBGC ( gve, "GVE %p AQ at [%08lx,%08lx) scratch [%08lx,%08lx)\n",
	       gve, virt_to_phys ( admin->cmd ),
	       ( virt_to_phys ( admin->cmd ) + admin_len ),
	       virt_to_phys ( scratch->buf ),
	       ( virt_to_phys ( scratch->buf ) + scratch_len ) );
	return 0;
}

/**
 * Free admin queue
 *
 * @v gve		GVE device
 *
 * Releases the scratch buffer and then the admin queue command ring.
 */
static void gve_admin_free ( struct gve_nic *gve ) {
	struct gve_scratch *scratch = &gve->scratch;
	struct gve_admin *admin = &gve->admin;

	/* Free scratch buffer */
	dma_free ( &scratch->map, scratch->buf, sizeof ( *scratch->buf ) );

	/* Free admin queue */
	dma_free ( &admin->map, admin->cmd,
		   ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) ) );
}

/**
 * Enable admin queue
 *
 * @v gve		GVE device
 *
 * Resets the producer counter and programs the admin queue location
 * and length into the device configuration registers, then sets the
 * driver status register to GVE_CFG_DRVSTAT_RUN.
 */
static void gve_admin_enable ( struct gve_nic *gve ) {
	struct gve_admin *admin = &gve->admin;
	size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
	physaddr_t base;

	/* Reset queue */
	admin->prod = 0;

	/* Program queue addresses and capabilities
	 *
	 * The queue location is written both as a page frame number
	 * and as a split low/high base address.  The high half is
	 * written as zero when physical addresses are only 32 bits
	 * wide.  All register values are byte-swapped before being
	 * written.
	 */
	base = dma ( &admin->map, admin->cmd );
	writel ( bswap_32 ( base / GVE_PAGE_SIZE ),
		 gve->cfg + GVE_CFG_ADMIN_PFN );
	writel ( bswap_32 ( base & 0xffffffffUL ),
		 gve->cfg + GVE_CFG_ADMIN_BASE_LO );
	if ( sizeof ( base ) > sizeof ( uint32_t ) ) {
		writel ( bswap_32 ( ( ( uint64_t ) base ) >> 32 ),
			 gve->cfg + GVE_CFG_ADMIN_BASE_HI );
	} else {
		writel ( 0, gve->cfg + GVE_CFG_ADMIN_BASE_HI );
	}
	/* NOTE(review): the length is swapped as a 16-bit quantity
	 * but written with a 32-bit access; presumably this matches
	 * the register layout - confirm against the register
	 * definitions.
	 */
	writel ( bswap_16 ( admin_len ), gve->cfg + GVE_CFG_ADMIN_LEN );
	writel ( bswap_32 ( GVE_CFG_DRVSTAT_RUN ), gve->cfg + GVE_CFG_DRVSTAT );
}

/**
 * Get next available admin queue command slot
 *
 * @v gve		GVE device
 * @ret cmd		Admin queue command
 *
 * The slot is selected by the current producer counter and is zeroed
 * before being returned.  The producer counter itself is not changed.
 */
static union gve_admin_command * gve_admin_command ( struct gve_nic *gve ) {
	struct gve_admin *admin = &gve->admin;
	unsigned int prod = admin->prod;
	unsigned int slot = ( prod % GVE_ADMIN_COUNT );
	union gve_admin_command *cmd = &admin->cmd[slot];

	/* Start from a clean request */
	memset ( cmd, 0, sizeof ( *cmd ) );
	return cmd;
}

/**
 * Wait for admin queue command to complete
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * Polls until the device's admin queue event counter matches the
 * driver's producer counter.  Returns zero once they match,
 * -ECONNRESET if the admin queue page frame number register is found
 * to be zero while waiting, or -ETIMEDOUT if neither happens within
 * GVE_ADMIN_MAX_WAIT_MS polls at one millisecond intervals.
 */
static int gve_admin_wait ( struct gve_nic *gve ) {
	struct gve_admin *admin = &gve->admin;
	uint32_t evt;
	uint32_t pfn;
	unsigned int i;

	/* Wait for any outstanding commands to complete */
	for ( i = 0 ; i < GVE_ADMIN_MAX_WAIT_MS ; i++ ) {

		/* Check event counter */
		rmb();
		evt = bswap_32 ( readl ( gve->cfg + GVE_CFG_ADMIN_EVT ) );
		if ( evt == admin->prod )
			return 0;

		/* Check for device reset (a zero page frame number is
		 * treated as meaning that the device has been reset)
		 */
		pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
		if ( ! pfn )
			break;

		/* Delay */
		mdelay ( 1 );
	}

	/* The last observed page frame number distinguishes a timeout
	 * (still non-zero) from a reset (zero).
	 */
	DBGC ( gve, "GVE %p AQ %#02x %s (completed %#02x, status %#08x)\n",
	       gve, admin->prod, ( pfn ? "timed out" : "saw reset" ), evt,
	       bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) );
	return ( pfn ? -ETIMEDOUT : -ECONNRESET );
}

/**
 * Issue admin queue command
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * Issues the command held in the slot selected by the current
 * producer counter (the same slot that gve_admin_command() returns),
 * then waits for the device to complete it.  Returns zero if the
 * command completes with GVE_ADMIN_STATUS_OK, the error from
 * gve_admin_wait() if waiting fails, or an EIO variant chosen by
 * EIO_ADMIN() from the command status otherwise.
 */
static int gve_admin ( struct gve_nic *gve ) {
	struct gve_admin *admin = &gve->admin;
	union gve_admin_command *cmd;
	unsigned int index;
	uint32_t opcode;
	uint32_t status;
	int rc;

	/* Ensure admin queue is idle */
	if ( ( rc = gve_admin_wait ( gve ) ) != 0 )
		return rc;

	/* Get next command slot
	 *
	 * The index and opcode are captured here so that they can be
	 * reported in the debug messages after the command has been
	 * issued.
	 */
	index = admin->prod;
	cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ];
	opcode = cmd->hdr.opcode;
	DBGC2 ( gve, "GVE %p AQ %#02x command %#04x request:\n",
		gve, index, opcode );
	DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) );

	/* Increment producer counter */
	admin->prod++;

	/* Ring doorbell (the write barrier precedes the doorbell
	 * write so that the command contents are written first)
	 */
	wmb();
	writel ( bswap_32 ( admin->prod ), gve->cfg + GVE_CFG_ADMIN_DB );

	/* Wait for command to complete */
	if ( ( rc = gve_admin_wait ( gve ) ) != 0 )
		return rc;

	/* Check command status (read back from the command slot) */
	status = be32_to_cpu ( cmd->hdr.status );
	if ( status != GVE_ADMIN_STATUS_OK ) {
		rc = -EIO_ADMIN ( status );
		DBGC ( gve, "GVE %p AQ %#02x command %#04x failed: %#08x\n",
		       gve, index, opcode, status );
		DBGC_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
		DBGC ( gve, "GVE %p AQ error: %s\n", gve, strerror ( rc ) );
		return rc;
	}

	DBGC2 ( gve, "GVE %p AQ %#02x command %#04x result:\n",
		gve, index, opcode );
	DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
	return 0;
}

/**
 * Issue simple admin queue command
 *
 * @v gve		GVE device
 * @v opcode		Operation code
 * @v id		ID parameter (or zero if not applicable)
 * @ret rc		Return status code
 *
 * Covers the admin queue commands whose parameter list is either
 * empty or a single 32-bit ID.
 */
static int gve_admin_simple ( struct gve_nic *gve, unsigned int opcode,
			      unsigned int id ) {
	union gve_admin_command *cmd = gve_admin_command ( gve );

	/* Fill in the request */
	cmd->hdr.opcode = opcode;
	cmd->simple.id = cpu_to_be32 ( id );

	/* Issue it and pass the result straight back */
	return gve_admin ( gve );
}

/**
 * Get device descriptor
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * Asks the device to write its descriptor into the scratch buffer,
 * then extracts the queue sizes, MAC address and MTU, records which
 * device options are present, and selects the operating mode.
 *
 * Returns zero on success, the error from gve_admin() if the command
 * fails, or -EINVAL if the option list does not fit within the
 * descriptor length reported by the device.
 */
static int gve_describe ( struct gve_nic *gve ) {
	struct net_device *netdev = gve->netdev;
	struct gve_device_descriptor *desc = &gve->scratch.buf->desc;
	union gve_admin_command *cmd;
	struct gve_option *opt;
	unsigned int count;
	unsigned int id;
	size_t offset;
	size_t max;
	size_t len;
	int rc;

	/* Construct request */
	cmd = gve_admin_command ( gve );
	cmd->hdr.opcode = GVE_ADMIN_DESCRIBE;
	cmd->desc.addr = cpu_to_be64 ( dma ( &gve->scratch.map, desc ) );
	cmd->desc.ver = cpu_to_be32 ( GVE_ADMIN_DESCRIBE_VER );
	cmd->desc.len = cpu_to_be32 ( sizeof ( *desc ) );

	/* Issue command */
	if ( ( rc = gve_admin ( gve ) ) != 0 )
		return rc;
	DBGC2 ( gve, "GVE %p device descriptor:\n", gve );
	DBGC2_HDA ( gve, 0, desc, sizeof ( *desc ) );

	/* Extract queue parameters */
	gve->events.count = be16_to_cpu ( desc->counters );
	gve->tx.count = be16_to_cpu ( desc->tx_count );
	gve->rx.count = be16_to_cpu ( desc->rx_count );
	DBGC ( gve, "GVE %p using %d TX, %d RX, %d events\n",
	       gve, gve->tx.count, gve->rx.count, gve->events.count );

	/* Extract network parameters */
	build_assert ( sizeof ( desc->mac ) == ETH_ALEN );
	memcpy ( netdev->hw_addr, &desc->mac, sizeof ( desc->mac ) );
	netdev->mtu = be16_to_cpu ( desc->mtu );
	netdev->max_pkt_len = ( netdev->mtu + ETH_HLEN );
	DBGC ( gve, "GVE %p MAC %s (\"%s\") MTU %zd\n",
	       gve, eth_ntoa ( netdev->hw_addr ),
	       inet_ntoa ( desc->mac.in ), netdev->mtu );

	/* Parse options
	 *
	 * Options are variable-length records, each consisting of a
	 * header followed by a body whose length is given in the
	 * header.  Every record is checked against the total
	 * descriptor length reported by the device.
	 *
	 * NOTE(review): the reported length is not itself checked
	 * against sizeof ( *desc ) - confirm whether it should be
	 * clamped to the size of the buffer offered to the device.
	 */
	count = be16_to_cpu ( desc->opt_count );
	max = be16_to_cpu ( desc->len );
	gve->options = 0;
	for ( offset = offsetof ( typeof ( *desc ), opts ) ; count ;
	      count--, offset += len ) {

		/* Check space for option header */
		if ( ( offset + sizeof ( *opt ) ) > max ) {
			DBGC ( gve, "GVE %p underlength option at +%#02zx:\n",
			       gve, offset );
			DBGC_HDA ( gve, 0, desc, sizeof ( *desc ) );
			return -EINVAL;
		}
		opt = ( ( ( void * ) desc ) + offset );

		/* Check space for option body */
		len = ( sizeof ( *opt ) + be16_to_cpu ( opt->len ) );
		if ( ( offset + len ) > max ) {
			DBGC ( gve, "GVE %p malformed option at +%#02zx:\n",
			       gve, offset );
			DBGC_HDA ( gve, 0, desc, sizeof ( *desc ) );
			return -EINVAL;
		}

		/* Record option as supported
		 *
		 * Option IDs too large to be represented in the
		 * bitmask are ignored.  The shift uses an unsigned
		 * constant, since shifting a signed one into the top
		 * bit would be undefined behaviour.
		 */
		id = be16_to_cpu ( opt->id );
		if ( id < ( 8 * sizeof ( gve->options ) ) )
			gve->options |= ( 1U << id );
	}
	DBGC ( gve, "GVE %p supports options %#08x\n", gve, gve->options );

	/* Select preferred operating mode (in descending order of
	 * preference)
	 */
	if ( gve->options & ( 1U << GVE_OPT_GQI_QPL ) ) {
		/* GQI-QPL: in-order queues, queue page list addressing */
		gve->mode = GVE_MODE_QPL;
	} else if ( gve->options & ( 1U << GVE_OPT_GQI_RDA ) ) {
		/* GQI-RDA: in-order queues, raw DMA addressing */
		gve->mode = 0;
	} else if ( gve->options & ( 1U << GVE_OPT_DQO_QPL ) ) {
		/* DQO-QPL: out-of-order queues, queue page list addressing */
		gve->mode = ( GVE_MODE_DQO | GVE_MODE_QPL );
	} else if ( gve->options & ( 1U << GVE_OPT_DQO_RDA ) ) {
		/* DQO-RDA: out-of-order queues, raw DMA addressing */
		gve->mode = GVE_MODE_DQO;
	} else {
		/* No options matched: assume the original GQI-QPL mode */
		gve->mode = GVE_MODE_QPL;
	}
	DBGC ( gve, "GVE %p using %s mode\n",
	       gve, gve_mode_name ( gve->mode ) );

	return 0;
}

/**
 * Configure device resources
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * Tells the device where the event counters and interrupt channels
 * live and which descriptor format is in use, then records each
 * interrupt doorbell and writes to it to disable the interrupt.
 */
static int gve_configure ( struct gve_nic *gve ) {
	struct gve_events *events = &gve->events;
	struct gve_irqs *irqs = &gve->irqs;
	union gve_admin_command *cmd;
	uint32_t doorbell;
	unsigned int db_off;
	unsigned int i;
	int rc;

	/* Construct request */
	cmd = gve_admin_command ( gve );
	cmd->hdr.opcode = GVE_ADMIN_CONFIGURE;
	cmd->conf.events =
		cpu_to_be64 ( dma ( &events->map, events->event ) );
	cmd->conf.irqs =
		cpu_to_be64 ( dma ( &irqs->map, irqs->irq ) );
	cmd->conf.num_events = cpu_to_be32 ( events->count );
	cmd->conf.num_irqs = cpu_to_be32 ( GVE_IRQ_COUNT );
	cmd->conf.irq_stride = cpu_to_be32 ( sizeof ( irqs->irq[0] ) );
	cmd->conf.format = GVE_FORMAT ( gve->mode );

	/* Issue command */
	if ( ( rc = gve_admin ( gve ) ) != 0 )
		return rc;

	/* Disable all interrupts
	 *
	 * The value written differs by queue format: zero for DQO,
	 * or the byte-swapped GVE_GQI_IRQ_DISABLE otherwise.  Each
	 * doorbell is located using the 32-bit word index read back
	 * from the corresponding interrupt channel.
	 */
	doorbell = ( ( gve->mode & GVE_MODE_DQO ) ?
		     0 : bswap_32 ( GVE_GQI_IRQ_DISABLE ) );
	for ( i = 0 ; i < GVE_IRQ_COUNT ; i++ ) {
		db_off = ( be32_to_cpu ( irqs->irq[i].db_idx ) *
			   sizeof ( uint32_t ) );
		DBGC ( gve, "GVE %p IRQ %d doorbell +%#04x\n", gve, i, db_off );
		irqs->db[i] = ( gve->db + db_off );
		writel ( doorbell, irqs->db[i] );
	}

	return 0;
}

/**
 * Deconfigure device resources
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 */
static int gve_deconfigure ( struct gve_nic *gve ) {

	/* The command takes no parameters, so the ID is a dummy value */
	return gve_admin_simple ( gve, GVE_ADMIN_DECONFIGURE, 0 );
}

/**
 * Register queue page list
 *
 * @v gve		GVE device
 * @v qpl		Queue page list
 * @ret rc		Return status code
 *
 * The list of page addresses is handed to the device via the scratch
 * buffer.  Nothing is done (and zero is returned) when the operating
 * mode does not use queue page lists.
 */
static int gve_register ( struct gve_nic *gve, struct gve_qpl *qpl ) {
	struct gve_pages *pages = &gve->scratch.buf->pages;
	union gve_admin_command *cmd;
	unsigned int page;
	void *addr;

	/* Raw DMA addressing has no page lists to register */
	if ( ( gve->mode & GVE_MODE_QPL ) == 0 )
		return 0;

	/* Fill in the DMA address of each page in turn */
	page = 0;
	while ( page < qpl->count ) {
		addr = ( qpl->data + ( page * GVE_PAGE_SIZE ) );
		pages->addr[page] = cpu_to_be64 ( dma ( &qpl->map, addr ) );
		page++;
	}

	/* Describe the page list to the device */
	cmd = gve_admin_command ( gve );
	cmd->hdr.opcode = GVE_ADMIN_REGISTER;
	cmd->reg.id = cpu_to_be32 ( qpl->id );
	cmd->reg.count = cpu_to_be32 ( qpl->count );
	cmd->reg.addr = cpu_to_be64 ( dma ( &gve->scratch.map, pages ) );
	cmd->reg.size = cpu_to_be64 ( GVE_PAGE_SIZE );

	/* Issue it and pass the result straight back */
	return gve_admin ( gve );
}

/**
 * Unregister page list
 *
 * @v gve		GVE device
 * @v qpl		Queue page list
 * @ret rc		Return status code
 *
 * Nothing is done (and zero is returned) when the operating mode
 * does not use queue page lists.
 */
static int gve_unregister ( struct gve_nic *gve, struct gve_qpl *qpl ) {

	/* Raw DMA addressing has no page lists to unregister */
	if ( ( gve->mode & GVE_MODE_QPL ) == 0 )
		return 0;

	/* The page list is identified purely by its ID */
	return gve_admin_simple ( gve, GVE_ADMIN_UNREGISTER, qpl->id );
}

/**
 * Construct command to create transmit queue
 *
 * @v queue		Transmit queue
 * @v qpl		Queue page list ID
 * @v cmd		Admin queue command
 *
 * The completion ring fields are filled in only if the queue has a
 * completion ring.
 */
static void gve_create_tx_param ( struct gve_queue *queue, uint32_t qpl,
				  union gve_admin_command *cmd ) {
	struct gve_admin_create_tx *create = &cmd->create_tx;

	/* Identifiers */
	create->qpl_id = cpu_to_be32 ( qpl );
	create->notify_id = cpu_to_be32 ( queue->type->irq );

	/* Queue resources and descriptor ring */
	create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) );
	create->desc =
		cpu_to_be64 ( dma ( &queue->desc_map, queue->desc.raw ) );
	create->desc_count = cpu_to_be16 ( queue->count );

	/* Completion ring, if there is one */
	if ( ! queue->cmplt.raw )
		return;
	create->cmplt =
		cpu_to_be64 ( dma ( &queue->cmplt_map, queue->cmplt.raw ) );
	create->cmplt_count = cpu_to_be16 ( queue->count );
}

/**
 * Construct command to create receive queue
 *
 * @v queue		Receive queue
 * @v qpl		Queue page list ID
 * @v cmd		Admin queue command
 */
static void gve_create_rx_param ( struct gve_queue *queue, uint32_t qpl,
				  union gve_admin_command *cmd ) {
	struct gve_admin_create_rx *create = &cmd->create_rx;

	/* Identifiers */
	create->qpl_id = cpu_to_be32 ( qpl );
	create->notify_id = cpu_to_be32 ( queue->type->irq );

	/* Queue resources */
	create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) );

	/* Descriptor ring */
	create->desc =
		cpu_to_be64 ( dma ( &queue->desc_map, queue->desc.raw ) );
	create->desc_count = cpu_to_be16 ( queue->count );

	/* Completion ring */
	create->cmplt =
		cpu_to_be64 ( dma ( &queue->cmplt_map, queue->cmplt.raw ) );
	create->cmplt_count = cpu_to_be16 ( queue->count );

	/* Receive buffer size */
	create->bufsz = cpu_to_be16 ( GVE_BUF_SIZE );
}

/**
 * Create transmit or receive queue
 *
 * @v gve		GVE device
 * @v queue		Descriptor queue
 * @ret rc		Return status code
 *
 * Resets the queue's software state and rings, issues the
 * queue-type-specific creation command, and then records the
 * doorbell and event counter locations that the device reports back
 * in the queue resources.
 */
static int gve_create_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
	const struct gve_queue_type *type = queue->type;
	const struct gve_queue_stride *stride = &queue->stride;
	union gve_admin_command *cmd;
	struct gve_buffer *buf;
	unsigned int db_off;
	unsigned int evt_idx;
	unsigned int tag;
	unsigned int i;
	uint32_t qpl;
	int rc;

	/* Reset queue (counters, rings, and the list of buffer tags) */
	queue->prod = 0;
	queue->cons = 0;
	queue->done = 0;
	memset ( queue->desc.raw, 0, ( queue->count * stride->desc ) );
	memset ( queue->cmplt.raw, 0, ( queue->count * stride->cmplt ) );
	for ( i = 0 ; i < queue->fill ; i++ )
		queue->tag[i] = i;

	/* Pre-populate descriptor offsets for in-order queues
	 *
	 * The buffer address occupies the final bytes of each
	 * descriptor.  Descriptor i is given the buffer with tag
	 * ( i mod fill ); the mask relies on the fill level being a
	 * power of two.
	 */
	if ( ! ( gve->mode & GVE_MODE_DQO ) ) {
		buf = ( queue->desc.raw + stride->desc - sizeof ( *buf ) );
		for ( i = 0 ; i < queue->count ; i++ ) {
			tag = ( i & ( queue->fill - 1 ) );
			buf->addr = cpu_to_be64 ( gve_address ( queue, tag ) );
			buf = ( ( ( void * ) buf ) + stride->desc );
		}
	}

	/* Construct request (using the placeholder page list ID
	 * GVE_RAW_QPL when queue page lists are not in use)
	 */
	cmd = gve_admin_command ( gve );
	cmd->hdr.opcode = type->create;
	qpl = ( ( gve->mode & GVE_MODE_QPL ) ? type->qpl : GVE_RAW_QPL );
	type->param ( queue, qpl, cmd );

	/* Issue command */
	if ( ( rc = gve_admin ( gve ) ) != 0 )
		return rc;

	/* Record indices (the doorbell index counts 32-bit words) */
	db_off = ( be32_to_cpu ( queue->res->db_idx ) * sizeof ( uint32_t ) );
	evt_idx = be32_to_cpu ( queue->res->evt_idx );
	DBGC ( gve, "GVE %p %s doorbell +%#04x event counter %d\n",
	       gve, type->name, db_off, evt_idx );
	queue->db = ( gve->db + db_off );
	assert ( evt_idx < gve->events.count );
	queue->event = &gve->events.event[evt_idx];
	assert ( queue->event->count == 0 );

	/* Unmask dummy interrupt */
	pci_msix_unmask ( &gve->msix, type->irq );

	/* Rearm queue interrupt if applicable */
	if ( gve->mode & GVE_MODE_DQO )
		writel ( GVE_DQO_IRQ_REARM, gve->irqs.db[type->irq] );

	return 0;
}

/**
 * Destroy transmit or receive queue
 *
 * @v gve		GVE device
 * @v queue		Descriptor queue
 * @ret rc		Return status code
 */
static int gve_destroy_queue ( struct gve_nic *gve, struct gve_queue *queue ) {

	/* Mask the queue's dummy interrupt before tearing it down */
	pci_msix_mask ( &gve->msix, queue->type->irq );

	/* Ask the device to destroy the queue */
	return gve_admin_simple ( gve, queue->type->destroy, 0 );
}

/******************************************************************************
 *
 * Network device interface
 *
 ******************************************************************************
 */

/**
 * Allocate shared queue resources
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 *
 * Allocates the interrupt channels and the event counters.  On
 * failure, nothing remains allocated.
 */
static int gve_alloc_shared ( struct gve_nic *gve ) {
	struct gve_events *events = &gve->events;
	struct gve_irqs *irqs = &gve->irqs;
	size_t events_len = ( events->count * sizeof ( events->event[0] ) );
	size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) );

	/* Allocate interrupt channels */
	irqs->irq = dma_alloc ( gve->dma, &irqs->map, irqs_len, GVE_ALIGN );
	if ( irqs->irq == NULL )
		return -ENOMEM;
	DBGC ( gve, "GVE %p IRQs at [%08lx,%08lx)\n",
	       gve, virt_to_phys ( irqs->irq ),
	       ( virt_to_phys ( irqs->irq ) + irqs_len ) );

	/* Allocate event counters, releasing the interrupt channels
	 * if this second allocation fails.
	 */
	events->event = dma_alloc ( gve->dma, &events->map, events_len,
				    GVE_ALIGN );
	if ( events->event == NULL ) {
		dma_free ( &irqs->map, irqs->irq, irqs_len );
		return -ENOMEM;
	}
	DBGC ( gve, "GVE %p events at [%08lx,%08lx)\n",
	       gve, virt_to_phys ( events->event ),
	       ( virt_to_phys ( events->event ) + events_len ) );

	return 0;
}

/**
 * Free shared queue resources
 *
 * @v gve		GVE device
 *
 * Releases the event counters and then the interrupt channels.
 */
static void gve_free_shared ( struct gve_nic *gve ) {
	struct gve_events *events = &gve->events;
	struct gve_irqs *irqs = &gve->irqs;

	/* Free event counters */
	dma_free ( &events->map, events->event,
		   ( events->count * sizeof ( events->event[0] ) ) );

	/* Free interrupt channels */
	dma_free ( &irqs->map, irqs->irq,
		   ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) ) );
}

/**
 * Allocate queue page list
 *
 * @v gve		GVE device
 * @v qpl		Queue page list
 * @v id		Queue page list ID
 * @v buffers		Number of data buffers
 * @ret rc		Return status code
 *
 * The pages are allocated as one contiguous block large enough to
 * hold the requested number of buffers.
 */
static int gve_alloc_qpl ( struct gve_nic *gve, struct gve_qpl *qpl,
			   uint32_t id, unsigned int buffers ) {
	size_t len;

	/* Record ID */
	qpl->id = id;

	/* Round the buffer count up to a whole number of pages */
	build_assert ( GVE_BUF_SIZE <= GVE_PAGE_SIZE );
	qpl->count = ( ( buffers + GVE_BUF_PER_PAGE - 1 ) / GVE_BUF_PER_PAGE );
	assert ( qpl->count <= GVE_QPL_MAX );
	len = ( qpl->count * GVE_PAGE_SIZE );

	/* Allocate all pages as a single block */
	qpl->data = dma_umalloc ( gve->dma, &qpl->map, len, GVE_ALIGN );
	if ( qpl->data == NULL )
		return -ENOMEM;

	/* Choose the base for buffer addresses: zero when the mode is
	 * exactly GVE_MODE_QPL, or the DMA address of the block in
	 * every other mode.
	 */
	if ( gve->mode == GVE_MODE_QPL ) {
		qpl->base = 0;
	} else {
		qpl->base = dma ( &qpl->map, qpl->data );
	}

	DBGC ( gve, "GVE %p QPL %#08x at [%08lx,%08lx)\n",
	       gve, qpl->id, virt_to_phys ( qpl->data ),
	       ( virt_to_phys ( qpl->data ) + len ) );
	return 0;
}

/**
 * Free queue page list
 *
 * @v nic		GVE device (unused)
 * @v qpl		Queue page list
 */
static void gve_free_qpl ( struct gve_nic *nic __unused,
			   struct gve_qpl *qpl ) {

	/* Release the whole block of pages in one go */
	dma_ufree ( &qpl->map, qpl->data, ( qpl->count * GVE_PAGE_SIZE ) );
}

/**
 * Calculate next receive sequence number
 *
 * @v seq		Current sequence number, or zero to start sequence
 * @ret next		Next sequence number
 */
static inline __attribute__ (( always_inline )) unsigned int
gve_next ( unsigned int seq ) {
	unsigned int next;

	/* The receive completion sequence number is a three-bit
	 * counter that skips zero, cycling through the values 1 to 7
	 * inclusive (i.e. it counts modulo 7).
	 *
	 * Because 7 is coprime to 2^n, the sequence number is
	 * guaranteed to change every time a new completion is written
	 * to memory.
	 *
	 * Because zero is never used, the sequence number is also
	 * guaranteed to change when a completion is first written to
	 * a zero-initialised completion ring.
	 */
	next = ( ( seq + 1 ) & GVE_GQI_RX_SEQ_MASK );
	if ( next == 0 )
		next = 1;
	return next;
}

/**
 * Allocate descriptor queue
 *
 * @v gve		GVE device
 * @v queue		Descriptor queue
 * @ret rc		Return status code
 *
 * Allocates the queue page list, descriptor ring, completion ring
 * (if the queue has one) and queue resources.  Returns -EINVAL if
 * the queue size is zero or not a power of two, or an error if any
 * allocation fails.  On failure, everything allocated so far is
 * freed again.
 */
static int gve_alloc_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
	const struct gve_queue_type *type = queue->type;
	struct gve_queue_stride *stride = &queue->stride;
	struct dma_device *dma = gve->dma;
	size_t desc_len;
	size_t cmplt_len;
	size_t res_len;
	int rc;

	/* Sanity checks (queue size must be a non-zero power of two) */
	if ( ( queue->count == 0 ) ||
	     ( queue->count & ( queue->count - 1 ) ) ) {
		DBGC ( gve, "GVE %p %s invalid queue size %d\n",
		       gve, type->name, queue->count );
		rc = -EINVAL;
		goto err_sanity;
	}

	/* Set queue strides and calculate total lengths
	 *
	 * The strides depend on the queue format in use.  A zero
	 * completion stride gives a zero completion length, in which
	 * case no completion ring is allocated below.
	 */
	*stride = ( ( gve->mode & GVE_MODE_DQO ) ?
		    type->stride.dqo : type->stride.gqi );
	desc_len = ( queue->count * stride->desc );
	cmplt_len = ( queue->count * stride->cmplt );
	res_len = sizeof ( *queue->res );

	/* Calculate maximum fill level (the queue type's preferred
	 * level, capped at the queue size)
	 */
	assert ( ( type->fill & ( type->fill - 1 ) ) == 0 );
	queue->fill = type->fill;
	if ( queue->fill > queue->count )
		queue->fill = queue->count;
	DBGC ( gve, "GVE %p %s using QPL %#08x with %d/%d descriptors\n",
	       gve, type->name, type->qpl, queue->fill, queue->count );

	/* Allocate queue page list (one buffer per fill level entry) */
	if ( ( rc = gve_alloc_qpl ( gve, &queue->qpl, type->qpl,
				    queue->fill ) ) != 0 )
		goto err_qpl;

	/* Allocate descriptors */
	queue->desc.raw = dma_umalloc ( dma, &queue->desc_map, desc_len,
					GVE_ALIGN );
	if ( ! queue->desc.raw ) {
		rc = -ENOMEM;
		goto err_desc;
	}
	DBGC ( gve, "GVE %p %s descriptors at [%08lx,%08lx)\n",
	       gve, type->name, virt_to_phys ( queue->desc.raw ),
	       ( virt_to_phys ( queue->desc.raw ) + desc_len ) );

	/* Allocate completions */
	if ( cmplt_len ) {
		queue->cmplt.raw = dma_umalloc ( dma, &queue->cmplt_map,
						 cmplt_len, GVE_ALIGN );
		if ( ! queue->cmplt.raw ) {
			rc = -ENOMEM;
			goto err_cmplt;
		}
		DBGC ( gve, "GVE %p %s completions at [%08lx,%08lx)\n",
		       gve, type->name, virt_to_phys ( queue->cmplt.raw ),
		       ( virt_to_phys ( queue->cmplt.raw ) + cmplt_len ) );
	}

	/* Allocate queue resources */
	queue->res = dma_alloc ( dma, &queue->res_map, res_len, GVE_ALIGN );
	if ( ! queue->res ) {
		rc = -ENOMEM;
		goto err_res;
	}
	memset ( queue->res, 0, res_len );

	return 0;

	/* Error unwind: each label frees what was allocated before
	 * the corresponding failure point.  The first call below is
	 * never reached, since it follows the successful return; it
	 * shows how to free the final allocation.
	 */
	dma_free ( &queue->res_map, queue->res, res_len );
 err_res:
	if ( cmplt_len )
		dma_ufree ( &queue->cmplt_map, queue->cmplt.raw, cmplt_len );
 err_cmplt:
	dma_ufree ( &queue->desc_map, queue->desc.raw, desc_len );
 err_desc:
	gve_free_qpl ( gve, &queue->qpl );
 err_qpl:
 err_sanity:
	return rc;
}

/**
 * Free descriptor queue
 *
 * @v gve		GVE device
 * @v queue		Descriptor queue
 *
 * Releases everything allocated by gve_alloc_queue(), in reverse
 * order of allocation.
 */
static void gve_free_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
	size_t cmplt_len = ( queue->count * queue->stride.cmplt );
	size_t desc_len = ( queue->count * queue->stride.desc );

	/* Queue resources */
	dma_free ( &queue->res_map, queue->res, sizeof ( *queue->res ) );

	/* Completion ring (present only if its length is non-zero) */
	if ( cmplt_len != 0 )
		dma_ufree ( &queue->cmplt_map, queue->cmplt.raw, cmplt_len );

	/* Descriptor ring */
	dma_ufree ( &queue->desc_map, queue->desc.raw, desc_len );

	/* Queue page list */
	gve_free_qpl ( gve, &queue->qpl );
}

/**
 * Cancel any pending transmissions
 *
 * @v gve		GVE device
 */
static void gve_cancel_tx ( struct gve_nic *gve ) {
	struct net_device *netdev = gve->netdev;
	const size_t slots = ( sizeof ( gve->tx_iobuf ) /
			       sizeof ( gve->tx_iobuf[0] ) );
	struct io_buffer **slot;
	struct io_buffer *pending;
	size_t n;

	/* Walk every transmit slot, clearing it before reporting the
	 * buffer (if any) as cancelled.
	 */
	for ( n = 0 ; n < slots ; n++ ) {
		slot = &gve->tx_iobuf[n];
		pending = *slot;
		*slot = NULL;
		if ( ! pending )
			continue;
		netdev_tx_complete_err ( netdev, pending, -ECANCELED );
	}
}

/**
 * Start up device
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 */
static int gve_start ( struct gve_nic *gve ) {
	struct gve_queue *txq = &gve->tx;
	struct gve_queue *rxq = &gve->rx;
	int rc;

	/* Fail anything still recorded as in flight, then restart
	 * the receive sequence from gve_next ( 0 ).
	 */
	gve_cancel_tx ( gve );
	gve->seq = gve_next ( 0 );

	/* Bring the device up one stage at a time.  Each failure
	 * jumps to the label that unwinds only the stages that have
	 * already succeeded.
	 */
	rc = gve_configure ( gve );
	if ( rc != 0 )
		goto err_configure;

	rc = gve_register ( gve, &txq->qpl );
	if ( rc != 0 )
		goto err_register_tx;

	rc = gve_register ( gve, &rxq->qpl );
	if ( rc != 0 )
		goto err_register_rx;

	rc = gve_create_queue ( gve, txq );
	if ( rc != 0 )
		goto err_create_tx;

	rc = gve_create_queue ( gve, rxq );
	if ( rc != 0 )
		goto err_create_rx;

	return 0;

	/* Unwind path: the first statement is never reached and
	 * exists only to complete the cleanup ladder.
	 */
	gve_destroy_queue ( gve, rxq );
 err_create_rx:
	gve_destroy_queue ( gve, txq );
 err_create_tx:
	gve_unregister ( gve, &rxq->qpl );
 err_register_rx:
	gve_unregister ( gve, &txq->qpl );
 err_register_tx:
	gve_deconfigure ( gve );
 err_configure:
	return rc;
}

/**
 * Stop device
 *
 * @v gve		GVE device
 */
static void gve_stop ( struct gve_nic *gve ) {

	/* Undo gve_start() in reverse: queues first (receive, then
	 * transmit), then their page lists, then the device
	 * configuration.
	 */
	gve_destroy_queue ( gve, &gve->rx );
	gve_destroy_queue ( gve, &gve->tx );
	gve_unregister ( gve, &gve->rx.qpl );
	gve_unregister ( gve, &gve->tx.qpl );
	gve_deconfigure ( gve );
}

/**
 * Device startup process
 *
 * @v gve		GVE device
 */
static void gve_startup ( struct gve_nic *gve ) {
	struct net_device *netdev = gve->netdev;
	int rc;

	/* Reset the device and, only if that succeeds, enable the
	 * admin queue and start the device.
	 */
	rc = gve_reset ( gve );
	if ( rc == 0 ) {
		gve_admin_enable ( gve );
		rc = gve_start ( gve );
	}

	/* On success, clear the retry count and (ab)use the link
	 * status to report that startup has completed.
	 */
	if ( rc == 0 ) {
		gve->retries = 0;
		netdev_link_up ( netdev );
		return;
	}

	/* On failure, report the error via the link status and
	 * reschedule this process until the retry limit is reached.
	 */
	DBGC ( gve, "GVE %p startup failed: %s\n", gve, strerror ( rc ) );
	netdev_link_err ( netdev, rc );
	if ( gve->retries++ < GVE_RESET_MAX_RETRY )
		process_add ( &gve->startup );
}

/**
 * Trigger startup process
 *
 * @v gve		GVE device
 */
static void gve_restart ( struct gve_nic *gve ) {

	/* Both gve_transmit() and gve_poll() bail out while the link
	 * is not OK, so marking the link down first keeps them away
	 * from the queues until the startup process has run.
	 */
	netdev_link_down ( gve->netdev );

	/* Queue the startup process */
	process_add ( &gve->startup );
}

/**
 * Reset recovery watchdog
 *
 * @v timer		Reset recovery watchdog timer
 * @v over		Failure indicator
 */
static void gve_watchdog ( struct retry_timer *timer, int over __unused ) {
	struct gve_nic *gve = container_of ( timer, struct gve_nic, watchdog );
	uint32_t seen;
	int fault;

	/* Always re-arm first so that the watchdog keeps running */
	start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT );

	/* For test purposes, optionally fake a host reset by zeroing
	 * the admin queue PFN register.
	 */
	fault = inject_fault ( VM_MIGRATED_RATE );
	if ( fault != 0 ) {
		DBGC ( gve, "GVE %p synthesising host reset\n", gve );
		writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN );
	}

	/* Any movement in the consumer counters since the previous
	 * invocation counts as activity: record it and do nothing
	 * more this time round.
	 */
	seen = ( gve->tx.cons + gve->rx.cons );
	if ( seen != gve->activity ) {
		gve->activity = seen;
		return;
	}

	/* Idle: a non-zero admin queue PFN is treated as meaning
	 * that the device has not been reset.
	 */
	if ( readl ( gve->cfg + GVE_CFG_ADMIN_PFN ) != 0 ) {
		DBGC2 ( gve, "GVE %p idle but not in reset\n", gve );
		return;
	}

	/* Idle with a zeroed PFN: schedule a restart */
	DBGC ( gve, "GVE %p watchdog detected reset by host\n", gve );
	gve_restart ( gve );
}

/**
 * Open network device
 *
 * @v netdev		Network device
 * @ret rc		Return status code
 */
static int gve_open ( struct net_device *netdev ) {
	struct gve_nic *gve = netdev->priv;
	struct gve_queue *txq = &gve->tx;
	struct gve_queue *rxq = &gve->rx;
	int rc;

	/* Allocate the shared resources, then each queue in turn */
	rc = gve_alloc_shared ( gve );
	if ( rc != 0 )
		goto err_alloc_shared;

	rc = gve_alloc_queue ( gve, txq );
	if ( rc != 0 )
		goto err_alloc_tx;

	rc = gve_alloc_queue ( gve, rxq );
	if ( rc != 0 )
		goto err_alloc_rx;

	/* Device startup is not performed here: it is scheduled via
	 * gve_restart(), and the reset recovery watchdog is armed.
	 */
	gve_restart ( gve );
	start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT );

	return 0;

	/* Unwind path: the first statement is never reached and
	 * exists only to complete the cleanup ladder.
	 */
	gve_free_queue ( gve, rxq );
 err_alloc_rx:
	gve_free_queue ( gve, txq );
 err_alloc_tx:
	gve_free_shared ( gve );
 err_alloc_shared:
	return rc;
}

/**
 * Close network device
 *
 * @v netdev		Network device
 */
static void gve_close ( struct net_device *netdev ) {
	struct gve_nic *gve = netdev->priv;

	/* Prevent any further automatic (re)starts: stop the reset
	 * recovery watchdog and remove the startup process.
	 */
	stop_timer ( &gve->watchdog );
	process_del ( &gve->startup );

	/* Stop the device and then reset it (the reset status is
	 * not checked).
	 */
	gve_stop ( gve );
	gve_reset ( gve );

	/* Fail any transmissions that are still outstanding */
	gve_cancel_tx ( gve );

	/* Release the queues (receive, then transmit) followed by
	 * the shared queue resources.
	 */
	gve_free_queue ( gve, &gve->rx );
	gve_free_queue ( gve, &gve->tx );
	gve_free_shared ( gve );
}

/**
 * Transmit packet
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @ret rc		Return status code
 */
static int gve_transmit ( struct net_device *netdev, struct io_buffer *iobuf ) {
	struct gve_nic *gve = netdev->priv;
	struct gve_queue *tx = &gve->tx;
	struct gve_gqi_tx_descriptor *gqi;
	struct gve_dqo_tx_descriptor *dqo;
	unsigned int count;
	unsigned int index;
	unsigned int tag;
	unsigned int chain;
	uint32_t doorbell;
	size_t frag_len;
	size_t offset;
	size_t next;
	size_t len;

	/* Do nothing if queues are not yet set up
	 *
	 * The link status doubles as the "startup complete"
	 * indicator (see gve_startup() and gve_restart()).
	 */
	if ( ! netdev_link_ok ( netdev ) )
		return -ENETDOWN;

	/* Defer packet if there is no space in the transmit ring
	 *
	 * The packet is split into fragments of at most GVE_BUF_SIZE
	 * bytes, with one descriptor per fragment.  If the
	 * descriptors already outstanding plus those needed would
	 * exceed the fill level, the packet is passed to
	 * netdev_tx_defer() and success is returned.
	 */
	len = iob_len ( iobuf );
	count = ( ( len + GVE_BUF_SIZE - 1 ) / GVE_BUF_SIZE );
	if ( ( ( tx->prod - tx->cons ) + count ) > tx->fill ) {
		netdev_tx_defer ( netdev, iobuf );
		return 0;
	}

	/* Copy packet to queue pages and populate descriptors
	 *
	 * Each iteration handles one fragment.  "chain" carries the
	 * tag used by the previous fragment (zero for the first
	 * fragment) so that the fragments of a packet can be linked
	 * together via gve->tx_chain[] in out-of-order mode.
	 */
	for ( offset = 0, chain = 0 ; ; offset = next, chain = tag ) {

		/* Identify next available buffer
		 *
		 * The ring index is the producer counter masked by
		 * ( count - 1 ); the buffer tag is then looked up in
		 * the tag ring.
		 */
		index = ( tx->prod++ & ( tx->count - 1 ) );
		tag = tx->tag[ index % GVE_TX_FILL ];

		/* Sanity check */
		assert ( gve->tx_iobuf[tag] == NULL );

		/* Copy packet fragment */
		frag_len = ( len - offset );
		if ( frag_len > GVE_BUF_SIZE )
			frag_len = GVE_BUF_SIZE;
		memcpy ( gve_buffer ( tx, tag ),
			 ( iobuf->data + offset ), frag_len );
		next = ( offset + frag_len );

		/* Populate descriptor */
		if ( gve->mode & GVE_MODE_DQO ) {

			/* Out-of-order descriptor
			 *
			 * Only the final fragment carries the tag and
			 * the descriptor count; earlier fragments
			 * have both set to zero.  The tag of the
			 * previous fragment is remembered in
			 * tx_chain[] for use by gve_poll_tx().
			 */
			dqo = &tx->desc.tx.dqo[index];
			dqo->buf.addr =
				cpu_to_le64 ( gve_address ( tx, tag ) );
			if ( next == len ) {
				dqo->type = ( GVE_DQO_TX_TYPE_PACKET |
					      GVE_DQO_TX_TYPE_LAST );
				dqo->tag.id = tag;
				dqo->tag.count = count;
			} else {
				dqo->type = GVE_DQO_TX_TYPE_PACKET;
				dqo->tag.id = 0;
				dqo->tag.count = 0;
			}
			dqo->len = cpu_to_le16 ( frag_len );
			gve->tx_chain[tag] = chain;

		} else {

			/* In-order descriptor
			 *
			 * Only the first fragment carries the
			 * descriptor count and total packet length;
			 * continuation fragments have both set to
			 * zero.  No buffer address is written here.
			 * NOTE(review): the address is presumably
			 * prepopulated when the queue is created -
			 * confirm against gve_create_queue().
			 */
			gqi = &tx->desc.tx.gqi[index];
			if ( offset ) {
				gqi->type = GVE_GQI_TX_TYPE_CONT;
				gqi->count = 0;
				gqi->total = 0;
			} else {
				gqi->type = GVE_GQI_TX_TYPE_START;
				gqi->count = count;
				gqi->total = cpu_to_be16 ( len );
			}
			gqi->len = cpu_to_be16 ( frag_len );

		}
		DBGC2 ( gve, "GVE %p TXD %#04x %#02x:%#02x len %#04zx/%#04zx "
			"at %#08lx\n", gve, index, tag, count, frag_len, len,
			gve_address ( tx, tag ) );

		/* Record I/O buffer against final descriptor
		 *
		 * This is also the only exit from the loop.
		 */
		if ( next == len ) {
			gve->tx_iobuf[tag] = iobuf;
			break;
		}
	}
	assert ( ( tx->prod - tx->cons ) <= tx->fill );

	/* Ring doorbell
	 *
	 * Out-of-order mode writes the producer counter masked to a
	 * ring index; in-order mode writes the full counter
	 * byte-swapped.  The write barrier ensures that the
	 * descriptor writes above are issued before the doorbell
	 * write.
	 */
	doorbell = tx->prod;
	if ( gve->mode & GVE_MODE_DQO ) {
		doorbell &= ( tx->count - 1 );
	} else {
		doorbell = bswap_32 ( doorbell );
	}
	wmb();
	writel ( doorbell, tx->db );

	return 0;
}

/**
 * Poll for completed transmissions
 *
 * @v netdev		Network device
 */
static void gve_poll_tx ( struct net_device *netdev ) {
	struct gve_nic *gve = netdev->priv;
	struct gve_queue *tx = &gve->tx;
	struct gve_dqo_tx_completion *dqo;
	struct io_buffer *iobuf;
	unsigned int index;
	unsigned int gen;
	unsigned int bit;
	unsigned int tag;
	uint32_t count;

	/* Process transmit completions */
	if ( gve->mode & GVE_MODE_DQO ) {

		/* Out-of-order completions */
		while ( 1 ) {

			/* Read next possible completion
			 *
			 * The bit of tx->done immediately above the
			 * ring index toggles each time the completion
			 * ring wraps; it is used as the expected
			 * generation state.
			 */
			gen = ( tx->done & tx->count );
			index = ( tx->done & ( tx->count - 1 ) );
			dqo = &tx->cmplt.tx.dqo[index];

			/* Check generation bit
			 *
			 * Stop as soon as the entry's generation flag
			 * has the same truth value as the expected
			 * generation state: such an entry is treated
			 * as not yet written.  The read barrier
			 * orders the flag check before the reads of
			 * the rest of the entry.
			 */
			bit = ( dqo->flags & GVE_DQO_TXF_GEN );
			if ( ( !! bit ) == ( !! gen ) )
				break;
			rmb();
			tx->done++;

			/* Ignore non-packet completions */
			if ( ( ! ( dqo->flags & GVE_DQO_TXF_PKT ) ) ||
			     ( dqo->tag.count < 0 ) ) {
				DBGC2 ( gve, "GVE %p TXC %#04x flags %#02x "
					"ignored\n", gve, index, dqo->flags );
				continue;
			}

			/* Parse completion
			 *
			 * The completion carries the tag and count
			 * written to the packet's final descriptor by
			 * gve_transmit(), which is also where the I/O
			 * buffer was recorded.
			 */
			tag = dqo->tag.id;
			count = dqo->tag.count;
			iobuf = gve->tx_iobuf[tag];
			gve->tx_iobuf[tag] = NULL;
			assert ( iobuf != NULL );

			/* Return completed descriptors to ring
			 *
			 * Walk backwards through the packet's
			 * fragments via tx_chain[], putting each tag
			 * back into the tag ring and advancing the
			 * consumer counter once per descriptor.
			 */
			while ( count-- ) {
				DBGC2 ( gve, "GVE %p TXC %#04x %#02x:%#02x "
					"complete\n", gve, index, tag,
					dqo->tag.count );
				tx->tag[ tx->cons++ % GVE_TX_FILL ] = tag;
				tag = gve->tx_chain[tag];
			}

			/* Hand off to network stack */
			if ( iobuf )
				netdev_tx_complete ( netdev, iobuf );
		}

	} else {

		/* Read event counter
		 *
		 * In-order mode has no completion ring: the
		 * big-endian event counter is compared against the
		 * consumer counter instead.
		 */
		count = be32_to_cpu ( tx->event->count );

		/* Process transmit completions
		 *
		 * Only the slot for a packet's final descriptor holds
		 * an I/O buffer (see gve_transmit()), so slots for
		 * earlier fragments are simply skipped.
		 */
		while ( count != tx->cons ) {
			DBGC2 ( gve, "GVE %p TXC %#04x complete\n",
				gve, tx->cons );
			tag = ( tx->cons % GVE_TX_FILL );
			iobuf = gve->tx_iobuf[tag];
			gve->tx_iobuf[tag] = NULL;
			tx->cons++;
			if ( iobuf )
				netdev_tx_complete ( netdev, iobuf );
		}
	}
}

/**
 * Poll for received packets
 *
 * @v netdev		Network device
 */
static void gve_poll_rx ( struct net_device *netdev ) {
	struct gve_nic *gve = netdev->priv;
	struct gve_queue *rx = &gve->rx;
	struct gve_gqi_rx_completion *gqi;
	struct gve_dqo_rx_completion *dqo;
	struct io_buffer *iobuf;
	unsigned int index;
	unsigned int gen;
	unsigned int bit;
	unsigned int seq;
	unsigned int tag;
	uint32_t done;
	size_t total;
	size_t len;
	int rc;

	/* Process receive completions
	 *
	 * Completions are scanned using local copies of the "done"
	 * counter and sequence number.  The stored values (rx->done
	 * and gve->seq) are advanced only once a whole packet (or an
	 * error) has been seen, so a partially received packet is
	 * rescanned from its first completion on the next poll.
	 */
	done = rx->done;
	seq = gve->seq;
	total = 0;
	while ( 1 ) {

		/* Read next possible completion */
		rc = 0;
		gen = ( done & rx->count );
		index = ( done++ & ( rx->count - 1 ) );
		if ( gve->mode & GVE_MODE_DQO ) {

			/* Out-of-order completion */
			dqo = &rx->cmplt.rx.dqo[index];

			/* Check generation bit
			 *
			 * The generation flag is held within the
			 * little-endian length field.  Stop when it
			 * has the same truth value as the expected
			 * generation state (the bit of the counter
			 * immediately above the ring index).
			 */
			bit = ( dqo->len & cpu_to_le16 ( GVE_DQO_RXL_GEN ) );
			if ( ( !! bit ) == ( !! gen ) )
				break;
			rmb();

			/* Parse completion
			 *
			 * The length is masked with
			 * ( GVE_BUF_SIZE - 1 ), which also discards
			 * the generation flag.
			 */
			len = ( le16_to_cpu ( dqo->len ) &
				( GVE_BUF_SIZE - 1 ) );
			tag = dqo->tag;
			DBGC2 ( gve, "GVE %p RXC %#04x %#02x:%#02x len %#04zx "
				"at %#08zx\n", gve, index, tag, dqo->flags,
				len, gve_offset ( rx, tag ) );

			/* Accumulate a complete packet
			 *
			 * An error discards whatever has been
			 * accumulated so far; otherwise keep scanning
			 * until the completion flagged as last.
			 */
			if ( dqo->status & GVE_DQO_RXS_ERROR ) {
				rc = -EIO;
				total = 0;
			} else {
				total += len;
				if ( ! ( dqo->flags & GVE_DQO_RXF_LAST ) )
					continue;
			}

		} else {

			/* In-order completion */
			gqi = &rx->cmplt.rx.gqi[index];

			/* Check sequence number
			 *
			 * Stop when the entry does not carry the
			 * expected sequence number.
			 */
			if ( ( gqi->seq & GVE_GQI_RX_SEQ_MASK ) != seq )
				break;
			rmb();
			seq = gve_next ( seq );

			/* Parse completion */
			len = be16_to_cpu ( gqi->len );
			tag = ( index % GVE_RX_FILL );
			DBGC2 ( gve, "GVE %p RXC %#04x %#02x:%#02x len %#04zx "
				"at %#08zx\n", gve, index, gqi->seq,
				gqi->flags, len, gve_offset ( rx, tag ) );

			/* Accumulate a complete packet
			 *
			 * An error discards whatever has been
			 * accumulated so far; otherwise keep scanning
			 * while the "more" flag is set.  The stored
			 * sequence number is updated only at this
			 * point.
			 */
			if ( gqi->flags & GVE_GQI_RXF_ERROR ) {
				rc = -EIO;
				total = 0;
			} else {
				total += len;
				if ( gqi->flags & GVE_GQI_RXF_MORE )
					continue;
			}
			gve->seq = seq;
		}

		/* Allocate and populate I/O buffer
		 *
		 * Second pass: revisit every completion between
		 * rx->done and the scan position, releasing its
		 * buffer and (if an I/O buffer was obtained) copying
		 * out its data.  No allocation is attempted when the
		 * total is zero, which includes the error case.
		 */
		iobuf = ( total ? alloc_iob ( total ) : NULL );
		for ( ; rx->done != done ; rx->done++ ) {

			/* Re-read completion and return tag to ring */
			index = ( rx->done & ( rx->count - 1 ) );
			if ( gve->mode & GVE_MODE_DQO ) {
				dqo = &rx->cmplt.rx.dqo[index];
				tag = dqo->tag;
				len = ( le16_to_cpu ( dqo->len ) &
					( GVE_BUF_SIZE - 1 ) );
				rx->tag[ rx->cons++ % GVE_RX_FILL ] = tag;
			} else {
				gqi = &rx->cmplt.rx.gqi[index];
				tag = ( index % GVE_RX_FILL );
				len = be16_to_cpu ( gqi->len );
				assert ( rx->cons == rx->done );
				rx->cons++;
			}

			/* Copy data */
			if ( iobuf ) {
				memcpy ( iob_put ( iobuf, len ),
					 gve_buffer ( rx, tag ), len );
			}
		}
		assert ( ( iobuf == NULL ) || ( iob_len ( iobuf ) == total ) );
		total = 0;

		/* Hand off packet to network stack
		 *
		 * In in-order mode, GVE_GQI_RX_PAD bytes are pulled
		 * from the start of the packet before delivery.  If
		 * there is no I/O buffer then an error is reported:
		 * the completion error if one was seen, otherwise
		 * -ENOMEM.
		 */
		if ( iobuf ) {
			if ( ! ( gve->mode & GVE_MODE_DQO ) )
				iob_pull ( iobuf, GVE_GQI_RX_PAD );
			netdev_rx ( netdev, iobuf );
		} else {
			netdev_rx_err ( netdev, NULL, ( rc ? rc : -ENOMEM ) );
		}
	}
}

/**
 * Refill receive queue
 *
 * @v netdev		Network device
 */
static void gve_refill_rx ( struct net_device *netdev ) {
	struct gve_nic *gve = netdev->priv;
	struct gve_queue *rx = &gve->rx;
	struct gve_dqo_rx_descriptor *dqo;
	unsigned int refill;
	unsigned int index;
	unsigned int tag;
	uint32_t doorbell;

	/* Calculate refill quantity
	 *
	 * The target producer counter is the consumer counter plus
	 * the fill level.  Nothing needs to be done (and the
	 * doorbell is not rung) if the producer counter is already
	 * there.
	 */
	doorbell = ( rx->cons + rx->fill );
	refill = ( doorbell - rx->prod );
	if ( ! refill )
		return;

	/* Refill ring */
	if ( gve->mode & GVE_MODE_DQO ) {

		/* Out-of-order descriptors */
		while ( refill-- ) {

			/* Identify next available buffer */
			index = ( rx->prod++ & ( rx->count - 1 ) );
			tag = rx->tag[ index % GVE_RX_FILL ];

			/* Populate descriptor
			 *
			 * The descriptor carries both the buffer tag
			 * and the little-endian buffer address; the
			 * tag is read back from the completion by
			 * gve_poll_rx().
			 */
			dqo = &rx->desc.rx.dqo[index];
			dqo->tag = tag;
			dqo->buf.addr =
				cpu_to_le64 ( gve_address ( rx, tag ) );
			DBGC2 ( gve, "GVE %p RXD %#04x:%#02x at %#08llx\n",
				gve, index, dqo->tag,
				( ( unsigned long long )
				  le64_to_cpu ( dqo->buf.addr ) ) );
		}
		/* Ensure descriptor writes are issued before the
		 * doorbell write below.
		 */
		wmb();
		assert ( rx->prod == doorbell );

	} else {

		/* The in-order receive descriptors are prepopulated
		 * at the time of creating the receive queue (pointing
		 * to the preallocated queue pages).  Refilling is
		 * therefore just a case of ringing the doorbell if
		 * the device is not yet aware of any available
		 * descriptors.
		 */
		rx->prod += refill;
		assert ( rx->prod == doorbell );
		DBGC2 ( gve, "GVE %p RXD %#04x ready\n", gve, rx->prod );

		/* Doorbell is big-endian */
		doorbell = bswap_32 ( doorbell );
	}

	/* Ring doorbell
	 *
	 * The value written is the new producer counter (unmasked in
	 * out-of-order mode, byte-swapped in in-order mode).
	 */
	writel ( doorbell, rx->db );
}

/**
 * Poll for completed and received packets
 *
 * @v netdev		Network device
 */
static void gve_poll ( struct net_device *netdev ) {
	struct gve_nic *gve = netdev->priv;

	/* The queues may be touched only once startup has completed,
	 * which is signalled via the link status.
	 */
	if ( netdev_link_ok ( netdev ) ) {

		/* Reap transmit completions, then received packets,
		 * then hand receive buffers back to the device.
		 */
		gve_poll_tx ( netdev );
		gve_poll_rx ( netdev );
		gve_refill_rx ( netdev );

		/* Out-of-order mode additionally rearms the transmit
		 * and receive queue interrupts after each poll.
		 */
		if ( ( gve->mode & GVE_MODE_DQO ) != 0 ) {
			writel ( GVE_DQO_IRQ_REARM,
				 gve->irqs.db[GVE_TX_IRQ] );
			writel ( GVE_DQO_IRQ_REARM,
				 gve->irqs.db[GVE_RX_IRQ] );
		}
	}
}

/** GVE network device operations
 *
 * Installed on the network device by gve_probe() via netdev_init().
 */
static struct net_device_operations gve_operations = {
	/* Allocate queues and schedule device startup */
	.open		= gve_open,
	/* Stop device and free queues */
	.close		= gve_close,
	/* Queue a packet for transmission (or defer it) */
	.transmit	= gve_transmit,
	/* Reap completions, deliver packets and refill the ring */
	.poll		= gve_poll,
};

/******************************************************************************
 *
 * PCI interface
 *
 ******************************************************************************
 */

/** Transmit descriptor queue type
 *
 * Attached to the transmit queue by gve_probe().
 */
static const struct gve_queue_type gve_tx_type = {
	.name = "TX",
	.param = gve_create_tx_param,
	.qpl = GVE_TX_QPL,
	.irq = GVE_TX_IRQ,
	.fill = GVE_TX_FILL,
	.stride = {
		/* In-order mode: no completion stride is given, so it
		 * is implicitly zero and no completion ring length
		 * results (gve_poll_tx() reads an event counter
		 * instead).
		 */
		.gqi = {
			.desc = sizeof ( struct gve_gqi_tx_descriptor ),
		},
		/* Out-of-order mode: separate descriptor and
		 * completion rings.
		 */
		.dqo = {
			.desc = sizeof ( struct gve_dqo_tx_descriptor ),
			.cmplt = sizeof ( struct gve_dqo_tx_completion ),
		},
	},
	/* Admin queue commands; presumably issued by
	 * gve_create_queue() and gve_destroy_queue() respectively
	 * (not visible here - confirm).
	 */
	.create = GVE_ADMIN_CREATE_TX,
	.destroy = GVE_ADMIN_DESTROY_TX,
};

/** Receive descriptor queue type
 *
 * Attached to the receive queue by gve_probe().
 */
static const struct gve_queue_type gve_rx_type = {
	.name = "RX",
	.param = gve_create_rx_param,
	.qpl = GVE_RX_QPL,
	.irq = GVE_RX_IRQ,
	.fill = GVE_RX_FILL,
	.stride = {
		/* In-order mode: descriptor and completion rings */
		.gqi = {
			.desc = sizeof ( struct gve_gqi_rx_descriptor ),
			.cmplt = sizeof ( struct gve_gqi_rx_completion ),
		},
		/* Out-of-order mode: descriptor and completion rings */
		.dqo = {
			.desc = sizeof ( struct gve_dqo_rx_descriptor ),
			.cmplt = sizeof ( struct gve_dqo_rx_completion ),
		},
	},
	/* Admin queue commands; presumably issued by
	 * gve_create_queue() and gve_destroy_queue() respectively
	 * (not visible here - confirm).
	 */
	.create = GVE_ADMIN_CREATE_RX,
	.destroy = GVE_ADMIN_DESTROY_RX,
};

/**
 * Set up admin queue and get device description
 *
 * @v gve		GVE device
 * @ret rc		Return status code
 */
static int gve_setup ( struct gve_nic *gve ) {
	unsigned int tries_left;
	int rc;

	/* The device may throw in a few spurious resets of its own,
	 * so the whole reset/enable/describe sequence is attempted
	 * up to GVE_RESET_MAX_RETRY times.
	 */
	for ( tries_left = GVE_RESET_MAX_RETRY ; tries_left ; tries_left-- ) {

		/* Reset the device; start over on failure */
		rc = gve_reset ( gve );
		if ( rc != 0 )
			continue;

		/* Enable the admin queue */
		gve_admin_enable ( gve );

		/* Fetch the device description (including the MAC
		 * address); finish as soon as this succeeds.
		 */
		rc = gve_describe ( gve );
		if ( rc == 0 )
			return 0;
	}

	/* Every attempt failed: report the most recent error */
	DBGC ( gve, "GVE %p failed to get device description: %s\n",
	       gve, strerror ( rc ) );
	return rc;
}

/** Device startup process descriptor
 *
 * Binds gve_startup() to the "startup" process embedded within
 * struct gve_nic.  The process is initialised (stopped) by
 * gve_probe(), scheduled by gve_restart(), and rescheduled by
 * gve_startup() itself after a failed attempt.
 */
static struct process_descriptor gve_startup_desc =
	PROC_DESC_ONCE ( struct gve_nic, startup, gve_startup );

/**
 * Probe PCI device
 *
 * @v pci		PCI device
 * @ret rc		Return status code
 */
static int gve_probe ( struct pci_device *pci ) {
	struct net_device *netdev;
	struct gve_nic *gve;
	unsigned long cfg_phys;
	unsigned long db_phys;
	unsigned long db_len;
	int rc;

	/* Allocate the network device together with the
	 * driver-private area.
	 */
	netdev = alloc_etherdev ( sizeof ( *gve ) );
	if ( netdev == NULL ) {
		rc = -ENOMEM;
		goto err_alloc;
	}

	/* Tie the network device, PCI device and private data
	 * together, starting from a zeroed private area.
	 */
	netdev_init ( netdev, &gve_operations );
	gve = netdev->priv;
	pci_set_drvdata ( pci, netdev );
	netdev->dev = &pci->dev;
	memset ( gve, 0, sizeof ( *gve ) );
	gve->netdev = netdev;

	/* Attach the queue types and tag rings */
	gve->tx.type = &gve_tx_type;
	gve->rx.type = &gve_rx_type;
	gve->tx.tag = gve->tx_tag;
	gve->rx.tag = gve->rx_tag;

	/* Prepare (but do not start) the startup process and the
	 * reset recovery watchdog timer.
	 */
	process_init_stopped ( &gve->startup, &gve_startup_desc,
			       &netdev->refcnt );
	timer_init ( &gve->watchdog, gve_watchdog, &netdev->refcnt );

	/* Fix up PCI device */
	adjust_pci_device ( pci );

	/* Record and report the PCI revision */
	pci_read_config_byte ( pci, PCI_REVISION, &gve->revision );
	DBGC ( gve, "GVE %p is revision %#02x\n", gve, gve->revision );

	/* Map the configuration registers */
	cfg_phys = pci_bar_start ( pci, GVE_CFG_BAR );
	gve->cfg = pci_ioremap ( pci, cfg_phys, GVE_CFG_SIZE );
	if ( gve->cfg == NULL ) {
		rc = -ENODEV;
		goto err_cfg;
	}

	/* Map the whole of the doorbell BAR */
	db_phys = pci_bar_start ( pci, GVE_DB_BAR );
	db_len = pci_bar_size ( pci, GVE_DB_BAR );
	gve->db = pci_ioremap ( pci, db_phys, db_len );
	if ( gve->db == NULL ) {
		rc = -ENODEV;
		goto err_db;
	}

	/* Use the PCI device's DMA device with a 64-bit mask; the
	 * network device itself is expected to have none.
	 */
	gve->dma = &pci->dma;
	dma_set_mask_64bit ( gve->dma );
	assert ( netdev->dma == NULL );

	/* Configure dummy MSI-X interrupt */
	rc = pci_msix_enable ( pci, &gve->msix );
	if ( rc != 0 )
		goto err_msix;

	/* Allocate admin queue */
	rc = gve_admin_alloc ( gve );
	if ( rc != 0 )
		goto err_admin;

	/* Reset the device and fetch its description */
	rc = gve_setup ( gve );
	if ( rc != 0 )
		goto err_setup;

	/* Register network device */
	rc = register_netdev ( netdev );
	if ( rc != 0 )
		goto err_register_netdev;

	return 0;

	/* Unwind path: the first statement is never reached and
	 * exists only to complete the cleanup ladder.
	 */
	unregister_netdev ( netdev );
 err_register_netdev:
 err_setup:
	gve_reset ( gve );
	gve_admin_free ( gve );
 err_admin:
	pci_msix_disable ( pci, &gve->msix );
 err_msix:
	iounmap ( gve->db );
 err_db:
	iounmap ( gve->cfg );
 err_cfg:
	netdev_nullify ( netdev );
	netdev_put ( netdev );
 err_alloc:
	return rc;
}

/**
 * Remove PCI device
 *
 * @v pci		PCI device
 */
static void gve_remove ( struct pci_device *pci ) {
	struct net_device *netdev = pci_get_drvdata ( pci );
	struct gve_nic *gve = netdev->priv;

	/* Detach from the network stack before touching hardware */
	unregister_netdev ( netdev );

	/* Reset the device (the status is not checked) and release
	 * the admin queue.
	 */
	gve_reset ( gve );
	gve_admin_free ( gve );

	/* Undo the dummy MSI-X interrupt and the register mappings,
	 * in the reverse of the order used by gve_probe().
	 */
	pci_msix_disable ( pci, &gve->msix );
	iounmap ( gve->db );
	iounmap ( gve->cfg );

	/* Neutralise the network device and drop the reference */
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}

/** GVE PCI device IDs
 *
 * A single entry, named "gve" and described as "gVNIC".
 * NOTE(review): 0x1ae0 and 0x0042 are presumably the PCI vendor and
 * device IDs respectively - confirm against the PCI_ROM() definition.
 */
static struct pci_device_id gve_nics[] = {
	PCI_ROM ( 0x1ae0, 0x0042, "gve", "gVNIC", 0 ),
};

/** GVE PCI driver
 *
 * Associates the device ID table with the probe and remove entry
 * points.
 */
struct pci_driver gve_driver __pci_driver = {
	.ids = gve_nics,
	/* Number of entries in the device ID table */
	.id_count = ( sizeof ( gve_nics ) / sizeof ( gve_nics[0] ) ),
	.probe = gve_probe,
	.remove = gve_remove,
};
